# -*- coding: utf-8 -*-
import scrapy
from wallpaperscraft.items import WallpaperscraftItem


class WallpaperSpider(scrapy.Spider):
    """Crawl wallpaperscraft.com: sidebar categories -> paginated listing
    pages (1920x1080) -> per-wallpaper detail pages -> image download URL.

    Yields ``WallpaperscraftItem`` with fields ``catalog`` (category name)
    and ``url`` (direct image URL).
    """

    name = 'wallpaper'
    allowed_domains = ['wallpaperscraft.com']
    start_urls = ['http://wallpaperscraft.com']

    # Desktop-browser User-Agent; sent with every request so the site does
    # not serve a blocked/alternate page to the default Scrapy UA.
    header = {
        'user-agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'
    }

    def parse(self, response):
        """
        Entry point: read every category link in the left sidebar and
        schedule the first 1920x1080 listing page of each category.

        :param response: response for a ``start_urls`` page
        :return: generator of ``scrapy.Request`` for ``parse_page1``
        """
        links = response.xpath('//a[@class="filter__link"]/@href').extract()
        # Drop the aggregate "/all" pseudo-category to avoid re-crawling
        # every wallpaper a second time.
        catalog_urls = [str(i) for i in links if '/all' not in str(i)]
        for c in catalog_urls:
            catalog_name = c.replace('/catalog/', '')
            first_page = ''.join(self.start_urls) + str(c) + '/1920x1080'
            yield scrapy.Request(url=first_page, callback=self.parse_page1,
                                 meta={'catalog': catalog_name},
                                 headers=self.header)

    def _detail_requests(self, response):
        """Schedule a detail-page request for every wallpaper thumbnail on a
        listing page, propagating the category name through ``meta``."""
        catalog_urls = response.xpath('//a[@class="wallpapers__link"]/@href').extract()
        for c in catalog_urls:
            details_url = ''.join(self.start_urls) + str(c)
            self.logger.debug('detail page: %s', details_url)
            yield scrapy.Request(url=details_url, callback=self.parse_download,
                                 meta={'catalog': response.meta['catalog']},
                                 headers=self.header)

    def parse_page1(self, response):
        """
        Handle the first listing page of a category: schedule its detail
        pages, then discover the pager bounds and schedule every remaining
        listing page.

        :param response: response for ``<category>/1920x1080``
        :return: generator of ``scrapy.Request``
        """
        for request in self._detail_requests(response):
            yield request

        pager_link = response.xpath('//a[@class="pager__link"]/@href').extract()
        self.logger.debug('pager links: %s', pager_link)
        if not pager_link:
            # Single-page category: no pager element, nothing more to do.
            # (Without this guard, pager_link[0] raises IndexError.)
            return
        # First pager link points at page 2; strip the number to get the
        # "/.../page" prefix shared by all listing pages.
        base_url = str(pager_link[0]).replace('/page2', '/page')
        min_page = int(str(pager_link[0]).rsplit('/page')[-1])
        max_page = int(str(pager_link[-1]).rsplit('/page')[-1])
        # +1 because range() excludes its upper bound — the original code
        # silently skipped the last page of every category.
        for page in range(min_page, max_page + 1):
            pages_url = ''.join(self.start_urls) + base_url + str(page)
            self.logger.debug('listing page: %s', pages_url)
            yield scrapy.Request(url=pages_url, callback=self.parse_pages,
                                 meta={'catalog': response.meta['catalog']},
                                 headers=self.header)

    def parse_pages(self, response):
        """
        Handle listing page 2..N of a category: schedule its detail pages.

        :param response: response for ``<category>/1920x1080/pageN``
        :return: generator of ``scrapy.Request``
        """
        for request in self._detail_requests(response):
            yield request

    def parse_download(self, response):
        """
        Extract the wallpaper image URL from a detail page and yield an item.

        :param response: response for a wallpaper detail page
        :return: generator yielding one ``WallpaperscraftItem``
        """
        item = WallpaperscraftItem()
        download_url = response.xpath('//img[@class="wallpaper__image"]/@src').extract()
        item['catalog'] = str(response.meta['catalog'])
        # The xpath matches a single <img>; join collapses the one-element
        # list (or an empty list) into a plain string.
        item['url'] = str(''.join(download_url))
        yield item
